── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr 1.1.2 ✔ readr 2.1.4
✔ forcats 1.0.0 ✔ stringr 1.5.0
✔ ggplot2 3.4.3 ✔ tibble 3.2.1
✔ lubridate 1.9.2 ✔ tidyr 1.3.0
✔ purrr 1.0.2
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Attach the modeling packages used throughout the analysis.
library(dplyr)    # data manipulation verbs
library(splines)  # ns() natural-spline basis (Q2)
library(glmnet)   # ridge / lasso regression (Q2-Q4)
Loading required package: Matrix
Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':
expand, pack, unpack
Loaded glmnet 4.1-7
library(timetk)  # tk_augment_lags() for lagged features (Q4)
library(caret)   # createDataPartition(), confusionMatrix() (spam section)
Loading required package: lattice
Attaching package: 'caret'
The following object is masked from 'package:purrr':
lift
# Packages for the tree model (Q5) and distance computations.
library(tree)
library(rdist)

# Q1
# Load the data; model log counts against hour-of-day rescaled to [0, 1].
bike_data <- read.csv('bike_rental_hourly.csv')
bike_data$log_cnt <- log(bike_data$cnt)
bike_data$hour <- bike_data$hr / 23

# Train on Feb-Mar 2011, test on the following Apr-May 2011.
bike_data_train <- bike_data[bike_data$dteday >= as.Date("2011-02-01") &
                               bike_data$dteday <= as.Date("2011-03-31"), ]
bike_data_test <- bike_data[bike_data$dteday >= as.Date("2011-04-01") &
                              bike_data$dteday <= as.Date("2011-05-31"), ]

y_test <- bike_data_test$log_cnt
y_train <- bike_data_train$log_cnt

# Order of the polynomial fit.
p <- 2

# OLS fit of an order-p polynomial in hour.
# FIX: the original hard-coded 2 inside poly() despite defining p above;
# use p so the order is changed in one place only.
# crossprod() solves the normal equations without forming an explicit
# inverse, which is more stable than solve(t(X) %*% X) %*% t(X) %*% y.
X_train <- cbind(1, poly(bike_data_train$hour, p, raw = TRUE, simple = TRUE))
beta_hat <- solve(crossprod(X_train), crossprod(X_train, y_train))

# Predict in-sample and compute RMSE.
y_hat_train <- X_train %*% beta_hat
RMSE_train <- sqrt(sum((y_train - y_hat_train)^2) / length(y_train))

# Predict out-of-sample and compute RMSE.
X_test <- cbind(1, poly(bike_data_test$hour, p, raw = TRUE, simple = TRUE))
y_hat_test <- X_test %*% beta_hat
RMSE_test <- sqrt(sum((y_test - y_hat_test)^2) / length(y_test))

# Plot training data, test data, and the fitted curve on a fine grid.
plot(log_cnt ~ hour, data = bike_data_train, col = "cornflowerblue",
     ylim = c(0, 8))
lines(bike_data_test$hour, bike_data_test$log_cnt, type = "p",
      col = "lightcoral")
hours_grid <- seq(0, 1, length.out = 1000)
X_grid <- cbind(1, poly(hours_grid, p, raw = TRUE, simple = TRUE))
y_hat_grid <- X_grid %*% beta_hat
lines(hours_grid, y_hat_grid, lty = 1, col = "lightcoral")
legend(x = "topleft", pch = c(1, 1, NA), lty = c(NA, NA, 1),
       col = c("cornflowerblue", "lightcoral", "lightcoral"),
       legend = c("Train", "Test", "Fitted curve"))
Q1
1.1
# 1.1: refit with an 8th-order polynomial and re-plot.
y_test <- bike_data_test$log_cnt
y_train <- bike_data_train$log_cnt
p <- 8

# OLS via the normal equations on the raw polynomial basis.
X_train <- cbind(1, poly(bike_data_train$hour, p, raw = TRUE, simple = TRUE))
beta_hat <- solve(t(X_train) %*% X_train) %*% t(X_train) %*% y_train

# Predict in-sample and compute RMSE.
y_hat_train <- X_train %*% beta_hat
RMSE_train <- sqrt(sum((y_train - y_hat_train)^2) / length(y_train))

# Predict out-of-sample and compute RMSE.
X_test <- cbind(1, poly(bike_data_test$hour, p, raw = TRUE, simple = TRUE))
y_hat_test <- X_test %*% beta_hat
RMSE_test <- sqrt(sum((y_test - y_hat_test)^2) / length(y_test))

# Plot training data, test data, and the fit on a fine grid.
plot(log_cnt ~ hour, data = bike_data_train, col = "cornflowerblue",
     ylim = c(0, 8), main = "8th Order Polynomial Regression")
lines(bike_data_test$hour, bike_data_test$log_cnt, type = "p",
      col = "lightcoral")
hours_grid <- seq(0, 1, length.out = 1000)
X_grid <- cbind(1, poly(hours_grid, p, raw = TRUE, simple = TRUE))
y_hat_grid <- X_grid %*% beta_hat
lines(hours_grid, y_hat_grid, lty = 1, col = "lightcoral")
legend(x = "topleft", pch = c(1, 1, NA), lty = c(NA, NA, 1),
       col = c("cornflowerblue", "lightcoral", "lightcoral"),
       legend = c("Train", "Test", "Fitted curve"))
1.2
# 1.2: train/test RMSE for polynomial orders 1 through 10.
n_orders <- 10
RMSE_train <- numeric(length = n_orders)
RMSE_test <- numeric(length = n_orders)
for (i in seq_len(n_orders)) {
  p <- i
  X_train <- cbind(1, poly(bike_data_train$hour, p, raw = TRUE, simple = TRUE))
  # FIX: crossprod() avoids the explicit inverse; t(X) %*% X for a raw
  # polynomial basis of order ~10 is badly conditioned, so this is both
  # faster and numerically safer.
  beta_hat <- solve(crossprod(X_train), crossprod(X_train, y_train))
  # In-sample RMSE.
  y_hat_train <- X_train %*% beta_hat
  RMSE_train[i] <- sqrt(sum((y_train - y_hat_train)^2) / length(y_train))
  # Out-of-sample RMSE.
  X_test <- cbind(1, poly(bike_data_test$hour, p, raw = TRUE, simple = TRUE))
  y_hat_test <- X_test %*% beta_hat
  RMSE_test[i] <- sqrt(sum((y_test - y_hat_test)^2) / length(y_test))
}
plot(seq_len(n_orders), RMSE_train, type = "b", col = "cornflowerblue",
     xlab = "Polynomial Order", ylab = "RMSE",
     main = "RMSE vs Polynomial Order", ylim = c(0, 2))
lines(seq_len(n_orders), RMSE_test, type = "b", col = "lightcoral")
legend("topright", legend = c("Train RMSE", "Test RMSE"),
       col = c("cornflowerblue", "lightcoral"), lty = 1)
As we increase the order of the polynomial, both the training and the test error decline, with the test error remaining consistently above the training error. The decrease in test error appears to plateau around order 8, meaning that any higher order would add unnecessary complexity and risk overfitting the training data. Therefore, an 8th-order polynomial regression seems to be a good fit for this example.
1.3
# 1.3: compare the order-8 polynomial against a nonparametric loess fit.
y_test <- bike_data_test$log_cnt
y_train <- bike_data_train$log_cnt
p <- 8
X_train <- cbind(1, poly(bike_data_train$hour, p, raw = TRUE, simple = TRUE))
beta_hat <- solve(t(X_train) %*% X_train) %*% t(X_train) %*% y_train

# Predict in-sample and compute RMSE.
y_hat_train <- X_train %*% beta_hat
RMSE_train <- sqrt(sum((y_train - y_hat_train)^2) / length(y_train))

# Predict out-of-sample and compute RMSE.
X_test <- cbind(1, poly(bike_data_test$hour, p, raw = TRUE, simple = TRUE))
y_hat_test <- X_test %*% beta_hat
RMSE_test <- sqrt(sum((y_test - y_hat_test)^2) / length(y_test))

# Nonparametric local regression (loess with default span and degree).
loess_model <- loess(log_cnt ~ hour, data = bike_data_train)
loess_test <- predict(loess_model, bike_data_test)
loess_RMSE <- sqrt(sum((y_test - loess_test)^2) / length(y_test))

# BUG FIX: the original passed se = TRUE *inside* data.frame(), which just
# added a constant 'se' column to newdata instead of being an argument to
# predict(). Only the grid predictions are plotted, so request them directly.
local_plot <- predict(loess_model,
                      newdata = data.frame(hour = seq(0, 1, length.out = 1000)))

# Plot training data, test data, and both fits on a fine grid.
plot(log_cnt ~ hour, data = bike_data_train, col = "cornflowerblue",
     ylim = c(0, 8))
lines(bike_data_test$hour, bike_data_test$log_cnt, type = "p",
      col = "lightcoral")
hours_grid <- seq(0, 1, length.out = 1000)
X_grid <- cbind(1, poly(hours_grid, p, raw = TRUE, simple = TRUE))
y_hat_grid <- X_grid %*% beta_hat
lines(hours_grid, y_hat_grid, lty = 1, col = "lightcoral")
lines(hours_grid, local_plot, lty = 1, col = "green")
legend(x = "topleft", pch = c(1, 1, NA, NA), lty = c(NA, NA, 1, 1),
       col = c("cornflowerblue", "lightcoral", "lightcoral", "green"),
       legend = c("Train", "Test", "Poly Fitted curve", "Smoothed Fit"))
cat("The RMSE for Polynomial Regression is ", RMSE_test)
The RMSE for Polynomial Regression is 1.021448
cat("\nThe RMSE for Loess Fit is ", loess_RMSE)
The RMSE for Loess Fit is 1.120946
When comparing results on the test data for each model we can see the RMSE for the Polynomial regression comes out to be: 1.021448 compared to 1.120946 for the locally fitted method. Therefore with the standard settings the polynomial regression outperforms the locally fitted model on the test data.
Q2
# Q2: natural cubic spline regression with 25 interior knots on [0.05, 0.95].
set.seed(123)
suppressMessages(library(splines))

knots <- seq(0.05, 0.95, length.out = 25)
X_train <- ns(bike_data_train$hour, knots = knots, intercept = TRUE)
X_test <- ns(bike_data_test$hour, knots = knots, intercept = TRUE)

# OLS coefficients via the normal equations.
beta_hat <- solve(t(X_train) %*% X_train) %*% t(X_train) %*% y_train

# Plot training data, test data, and the spline fit on a fine grid.
plot(log_cnt ~ hour, data = bike_data_train, col = "cornflowerblue",
     ylim = c(0, 8))
lines(bike_data_test$hour, bike_data_test$log_cnt, type = "p",
      col = "lightcoral")
hours_grid <- seq(0, 1, length.out = 1000)
X_grid <- ns(hours_grid, knots = knots, intercept = TRUE)
y_hat_spline_grid <- X_grid %*% beta_hat
lines(hours_grid, y_hat_spline_grid, lty = 1, col = "lightcoral")
legend(x = "topleft", pch = c(1, 1, NA), lty = c(NA, NA, 1),
       col = c("cornflowerblue", "lightcoral", "lightcoral"),
       legend = c("Train", "Test", "Spline"))
# Q2.2: ridge regression (alpha = 0) on the natural-spline design matrix,
# with lambda chosen by the one-standard-error rule from cross-validation.
ridge_model <- glmnet(X_train, y_train, alpha = 0)
# NOTE(review): cv.glmnet assigns folds randomly; without a seed set
# immediately before this call the selected lambda is not reproducible.
cv <- cv.glmnet(X_train, y_train, alpha = 0)
optimal_lambda_ridge <- cv$lambda.1se

# Predictions at the 1-SE lambda on train, test, and the plotting grid.
ridge_train <- predict(ridge_model, s = optimal_lambda_ridge, newx = X_train)
ridge_test <- predict(ridge_model, s = optimal_lambda_ridge, newx = X_test)
ridge_train_plot <- predict(ridge_model, s = optimal_lambda_ridge, newx = X_grid)

ridge_RMSE_train <- sqrt(sum((y_train - ridge_train)^2) / length(y_train))
ridge_RMSE_predict <- sqrt(sum((y_test - ridge_test)^2) / length(y_test))

plot(log_cnt ~ hour, data = bike_data_train, col = "cornflowerblue",
     ylim = c(0, 8), main = "Ridge Regression with 1 SE Optimal Lambda")
lines(bike_data_test$hour, bike_data_test$log_cnt, type = "p",
      col = "lightcoral")
hours_grid <- seq(0, 1, length.out = 1000)
X_grid <- ns(hours_grid, knots = knots, intercept = TRUE)
# BUG FIX: the original computed X_grid %*% beta_hat_new here, but
# beta_hat_new is never defined anywhere in this file (and the product was
# unused); the line is removed. The legend vectors are also trimmed to
# match the three labels.
lines(hours_grid, ridge_train_plot, type = "l", col = "green")
legend(x = "topleft", pch = c(1, 1, NA), lty = c(NA, NA, 1),
       col = c("cornflowerblue", "lightcoral", "green"),
       legend = c("Train", "Test", "Ridge Fit"))
cat("The RMSE for the training set is: ", ridge_RMSE_train)
The RMSE for the training set is: 0.7111962
cat("\nThe RMSE for test set is ", ridge_RMSE_predict)
The RMSE for test set is 1.000773
2.3
# Q2.3: repeat the ridge evaluation using lambda.min instead of lambda.1se.
min_optimal_lambda <- cv$lambda.min
ridge_train_min <- predict(ridge_model, s = min_optimal_lambda, newx = X_train)
ridge_test_min <- predict(ridge_model, s = min_optimal_lambda, newx = X_test)

# Train/test RMSE at the CV-minimizing lambda.
ridge_RMSE_train_min <- sqrt(sum((y_train - ridge_train_min)^2) / length(y_train))
ridge_RMSE_predict_min <- sqrt(sum((y_test - ridge_test_min)^2) / length(y_test))
cat("The RMSE for the training set is: ", ridge_RMSE_train_min)
The RMSE for the training set is: 0.6919639
cat("\nThe RMSE for test set is ", ridge_RMSE_predict_min)
The RMSE for test set is 1.002668
Previous results:
cat("The RMSE for the training set is: ", ridge_RMSE_train)
The RMSE for the training set is: 0.7111962
cat("\nThe RMSE for test set is ", ridge_RMSE_predict)
The RMSE for test set is 1.000773
When comparing the performance of the two models, we can see that the minimum-lambda model has the lower training error; however, the 1-SE-lambda model has a slightly lower test error, meaning the simpler model has generalized better to the test data.
2.4
# 2.4: lasso regression (alpha = 1) with the 1-SE lambda from cross-validation.
lasso_model <- glmnet(X_train, y_train, alpha = 1)
cv_lasso <- cv.glmnet(X_train, y_train, alpha = 1)
optimal_lambda_lasso <- cv_lasso$lambda.1se

# Predictions and RMSE at the selected lambda.
lasso_train <- predict(lasso_model, s = optimal_lambda_lasso, newx = X_train)
lasso_test <- predict(lasso_model, s = optimal_lambda_lasso, newx = X_test)
lasso_RMSE_train <- sqrt(sum((y_train - lasso_train)^2) / length(y_train))
lasso_RMSE_predict <- sqrt(sum((y_test - lasso_test)^2) / length(y_test))
cat("The Lasso Model RMSE for the training set is: ", lasso_RMSE_train)
The Lasso Model RMSE for the training set is: 0.7187198
cat("\nThe Lasso Model RMSE for test set is ", lasso_RMSE_predict)
The Lasso Model RMSE for test set is 1.006538
Both the Lasso and Ridge models produce comparable performance on the training and test data sets, with the ridge regression model using the one-standard-error lambda being the best-performing model in terms of out-of-sample performance.
Q3
# Q3: one-hot encode the categorical predictors, dropping the first level
# of each as the reference category.
# FIX: the original repeated the same four lines three times and its third
# comment said "weekday" where it encoded season; the logic is extracted
# into a helper so each variable is handled in one line.

# Build a one-hot matrix for `var` in `data`, drop the reference (first)
# level, and name the remaining columns with `labels`.
# NOTE(review): `labels` must be given in the sorted order of the factor
# levels (e.g. weathersit 2, 3, 4) — same assumption as the original code.
encode_one_hot <- function(data, var, labels) {
  m <- model.matrix(~ as.factor(data[[var]]) - 1)
  m <- m[, -1, drop = FALSE]  # remove reference category
  colnames(m) <- labels
  m
}

# weathersit: reference = clear; remaining levels labeled below.
bike_data <- cbind(bike_data,
                   encode_one_hot(bike_data, "weathersit",
                                  c('cloudy', 'light rain', 'heavy rain')))
# weekday: reference = Sunday.
bike_data <- cbind(bike_data,
                   encode_one_hot(bike_data, "weekday",
                                  c('Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat')))
# season: reference = Winter.
bike_data <- cbind(bike_data,
                   encode_one_hot(bike_data, "season",
                                  c('Spring', 'Summer', 'Fall')))
head(bike_data)
By plotting the coefficient values against lambda, we can see that three features are the last to shrink to 0 toward the right side of the plot. We can access those coefficients via the coef function used above with a large enough lambda as the input; the resulting table tells us that hour1 (feature 1), hour9 (feature 9), and atemp (feature 15) are the most important features for the prescribed Lasso model.
When plotting the autocorrelation function of both the training and test residuals we can see evidence of significant positive autocorrelation up to around lag 4. This is evidence of the independently and identically distributed errors condition being violated.
Q4
4.1
# Q4.1: hours vs fitted values for the final week of 2012 (Dec 25-31).
last_week <- design_data_test[design_data_test$dteday >= as.Date("2012-12-25") &
                                design_data_test$dteday <= as.Date("2012-12-31"), ]
response <- last_week$hr
# The lasso test predictions are aligned with design_data_test, so the last
# length(response) entries correspond to this final week.
corresponding_values <- tail(lasso_test, length(response))
plot(response, corresponding_values, xlab = "Hour", ylab = "Fitted Values",
     main = "Actual vs. Fitted Values", col = "cornflowerblue", pch = 16)
4.2
# Q4.2: lasso with lagged log_cnt features (lags 1-4 and 24).
max_lag <- 24  # largest lag; determines how many leading rows get NA lags

# FIX/SIMPLIFICATION: one tk_augment_lags() call with .lags = c(1:4, 24)
# produces the same lag columns as the original two-call-plus-cbind approach.
lagged_training <- design_data_train %>%
  tk_augment_lags(contains("log_cnt"), .lags = c(1:4, max_lag))
lagged_test <- design_data_test %>%
  tk_augment_lags(contains("log_cnt"), .lags = c(1:4, max_lag))

# Predictors: calendar/weather features, one-hot dummies, and the lags.
features <- c("hour", "yr", "holiday", "workingday", "temp", "atemp", "hum",
              "windspeed", "cloudy", "light rain", "heavy rain",
              "Mon", "Tue", "Wed", "Thu", "Fri", "Sat",
              "Spring", "Summer", "Fall",
              "log_cnt_lag1", "log_cnt_lag2", "log_cnt_lag3", "log_cnt_lag4",
              "log_cnt_lag24")
lagged_train_matrix <- na.omit(lagged_training[features])
lagged_test_matrix <- na.omit(lagged_test[features])

X_train <- model.matrix(~ ., data = lagged_train_matrix)
# na.omit() drops the first max_lag rows (log_cnt_lag24 is NA there), so the
# response must be trimmed the same way. FIX: derive the offset from
# max_lag instead of hard-coding 25.
y_train <- design_data_train$log_cnt
y_train_lagged <- y_train[(max_lag + 1):length(y_train)]

lagged_lasso_model <- glmnet(X_train, y_train_lagged, alpha = 1)
cv_lagged_lasso <- cv.glmnet(X_train, y_train_lagged, alpha = 1)
optimal_lagged_lasso <- cv_lagged_lasso$lambda.1se

X_test <- model.matrix(~ ., data = lagged_test_matrix)
y_test <- design_data_test$log_cnt
y_test_lagged <- y_test[(max_lag + 1):length(y_test)]

# RMSE on the aligned train/test responses at the 1-SE lambda.
lagged_lasso_train <- predict(lagged_lasso_model, s = optimal_lagged_lasso, newx = X_train)
lagged_lasso_test <- predict(lagged_lasso_model, s = optimal_lagged_lasso, newx = X_test)
lagged_lasso_RMSE_train <- sqrt(sum((y_train_lagged - lagged_lasso_train)^2) / length(y_train_lagged))
lagged_lasso_RMSE_predict <- sqrt(sum((y_test_lagged - lagged_lasso_test)^2) / length(y_test_lagged))
cat("The Lasso Model RMSE for the training set with lags is: ", lagged_lasso_RMSE_train)
The Lasso Model RMSE for the training set with lags is: 0.4242765
cat("\nThe Lasso Model RMSE for test set with extra lags is ", lagged_lasso_RMSE_predict)
The Lasso Model RMSE for test set with extra lags is 0.3615772
When comparing the RMSE values and ACF plots for the lagged lasso model we can see the addition of lags in our model has led to a significant improvement in performance on both the training and test data, as well as reducing the autocorrelation between residuals leading to a much more adequate model.
4.3
# Q4.3: overlay the lagged-lasso fit on the original fit for the last week.
lagged_values <- tail(lagged_lasso_test, length(response))
plot(response, corresponding_values, xlab = "Actual Counts",
     ylab = "Fitted Values", main = "Actual vs. Fitted Values",
     col = "cornflowerblue", pch = 16)
points(response, lagged_values, col = "lightcoral", pch = 16)
legend(x = "bottomright", pch = c(16, 16),
       col = c("cornflowerblue", "lightcoral"),
       legend = c("Original Fit", "Lagged Fit"))
Q5
5.1
# Q5.1: regression tree on the lagged design matrix.
df <- as.data.frame(X_train)
df_test <- as.data.frame(X_test)
# FIX: derive the number of predictor columns instead of hard-coding 34,
# so the renaming stays valid if the feature set changes.
var_names <- c("Intercept", paste0("Var", seq_len(ncol(df) - 1)))
train_tree_df <- setNames(df, var_names)
test_tree_df <- setNames(df_test, var_names)

# Fit the tree and evaluate RMSE on the aligned test response.
tree_model <- tree(y_train_lagged ~ ., train_tree_df)
tree_predict <- predict(tree_model, newdata = test_tree_df)
tree_RMSE <- sqrt(sum((y_test_lagged - tree_predict)^2) / length(y_test_lagged))
cat("\nThe Tree Model RMSE for test set with extra lags is ", tree_RMSE)
The Tree Model RMSE for test set with extra lags is 0.5991466
5.2
# Q5.2: draw the fitted tree and annotate the splits with variable names.
plot(tree_model)
text(tree_model, pretty = 0)
5.3
# Q5.3: compare original, lagged-lasso, and tree fits for the last week.
tree_values <- tail(tree_predict, length(response))
plot(response, corresponding_values, xlab = "Actual Counts",
     ylab = "Fitted Values", main = "Actual vs. Fitted Values",
     col = "cornflowerblue", pch = 16)
points(response, lagged_values, col = "lightcoral", pch = 16)
points(response, tree_values, col = "green", pch = 16)
legend(x = "bottomright", pch = c(16, 16, 16),
       col = c("cornflowerblue", "lightcoral", "green"),
       legend = c("Original Fit", "Lagged Fit", "Tree Fit"))
# Report the share of spam messages in the full data set.
pct_spam <- 100 * mean(Spam_ham_emails$spam == "spam")
print(paste("Percentage of spam:", pct_spam))
[1] "Percentage of spam: 39.4044772875462"
# Stratified 75/25 train/test split on the spam label.
set.seed(1234)
suppressMessages(library(caret))
train_obs <- createDataPartition(y = Spam_ham_emails$spam, p = .75, list = FALSE)
train <- Spam_ham_emails[train_obs, ]
test <- Spam_ham_emails[-train_obs, ]

# Confirm both training and test sets are balanced with respect to spam.
print(paste("Percentage of training data consisting of spam emails:",
            100 * mean(train$spam == "spam")))
[1] "Percentage of training data consisting of spam emails: 39.4088669950739"
print(paste("Percentage of test data consisting of spam emails:", 100*mean(test$spam =="spam")))
[1] "Percentage of test data consisting of spam emails: 39.3913043478261"
# Logistic regression of the spam label on all predictors.
glm_fit <- glm(spam ~ ., family = binomial, data = train)
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Predict spam probabilities on the test set and classify at a 0.5 cutoff.
y_prob_hat_test <- predict(glm_fit, newdata = test, type = "response")
threshold <- 0.5
# Predict spam if probability > threshold.
# BUG FIX: the original did as.factor(logical) followed by levels()<-.
# If every prediction fell on one side of the threshold, the factor would
# have a single level and the relabeling would silently mislabel it.
# Constructing the factor with explicit levels makes the mapping
# FALSE -> "not spam", TRUE -> "spam" unconditional.
y_hat_test <- factor(y_prob_hat_test > threshold,
                     levels = c(FALSE, TRUE),
                     labels = c("not spam", "spam"))
confusionMatrix(data = y_hat_test, test$spam, positive = "spam")
Confusion Matrix and Statistics
Reference
Prediction not spam spam
not spam 658 65
spam 39 388
Accuracy : 0.9096
95% CI : (0.8915, 0.9255)
No Information Rate : 0.6061
P-Value [Acc > NIR] : < 2e-16
Kappa : 0.8087
Mcnemar's Test P-Value : 0.01423
Sensitivity : 0.8565
Specificity : 0.9440
Pos Pred Value : 0.9087
Neg Pred Value : 0.9101
Prevalence : 0.3939
Detection Rate : 0.3374
Detection Prevalence : 0.3713
Balanced Accuracy : 0.9003
'Positive' Class : spam
The ROC curve plots the true positive rate (or sensitivity) of the classifier against the false positive rate (1 - specificity) at different classification thresholds. The Area Under the Curve (AUC) of the ROC plot can be used as a criterion to measure the usefulness of the classification model's predictions: the closer the curve reaches to the upper-left corner, the better the performance. A useless model would have an ROC curve that lies flat on the 45-degree line. In our case we have an AUC of 0.965, which means there is only a small overlap between the score distributions of the positive and negative classes implied by our model, and that our classifier has a high level of separability/performance.
When comparing the misclassification rate between the three models on the test data, we can see that the logistic regression model performs best and the KNN model performs worst.